# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
from allitbooks.items import BookItem
import time
import sys
import traceback

from scrapy import signals
from scrapy.xlib.pydispatch import dispatcher

class BookbotSpider(scrapy.Spider):
    """Crawl www.allitebooks.com: follow every category in the menu,
    walk each category's pagination, and yield a ``BookItem`` per book
    detail page.

    Class attributes:
        interested_topic: topic keywords of interest (currently informational).
        total_books_count: number of successfully parsed books, printed
            when the spider closes.
    """

    name = 'bookbot'
    allowed_domains = ['www.allitebooks.com']
    start_urls = ['http://www.allitebooks.com/']
    interested_topic = ['database', 'programming', 'operating', 'computer']
    top_topic = []
    total_books_count = 0  # incremented once per successfully parsed book

    def __init__(self):
        # NOTE(review): scrapy.xlib.pydispatch was removed in Scrapy 1.7+;
        # the modern equivalent is crawler.signals.connect in from_crawler.
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_closed(self, spider):
        """Signal handler: report how many books were parsed."""
        print("book counting:", BookbotSpider.total_books_count)

    def parse(self, response):
        """Entry point: follow every top-level category link in the menu."""
        topic_urls = response.css('div ul#menu-categories li a::attr(href)').extract()
        for url in topic_urls:
            yield Request(url=url, callback=self.parse_one_topic)

    def parse_one_topic(self, response):
        """Parse a category's first page: yield its book requests and
        schedule the remaining pagination pages.

        Bug fix: the original built a page URL from the site root, never
        used it, and re-extracted the first page's links total_page times,
        producing only duplicate requests. Pages are 1-based and must be
        resolved relative to the category URL.
        """
        # Books on this (first) page of the category.
        for link in response.css('h2.entry-title a::attr(href)').extract():
            yield Request(link, self.parse_one_book)

        # Pagination: the last pagination link text is the total page count.
        page_labels = response.css('div.pagination a::text').extract()
        if not page_labels:
            return  # single-page category
        total_page = int(page_labels[-1])
        print("url: ", response.url, ", total_page: ", total_page)
        # page/1 is the current response, so request pages 2..total_page.
        for p in range(2, total_page + 1):
            yield Request(response.urljoin('page/%d/' % p),
                          callback=self._parse_topic_page)

    def _parse_topic_page(self, response):
        """Parse a non-first category page: yield its book requests only."""
        for link in response.css('h2.entry-title a::attr(href)').extract():
            yield Request(link, self.parse_one_book)

    def parse_one_book(self, response):
        """Parse a book detail page into a ``BookItem``.

        Bug fix: on a scraping error the original fell through to the
        item-building code with the locals undefined (NameError) and still
        counted the failed page; we now skip the page instead.
        """
        detail = response.css('div.book-detail dl')
        try:
            name = response.css('header.entry-header h1::text').extract()[0].strip()
            # There may be several following siblings, but the author is
            # normally the first <dd>.
            author = detail.xpath('.//dt[text()="Author:"]/following-sibling::dd')[0].css('a::text').extract()[0]
            year = detail.xpath('.//dt[text()="Year:"]/following-sibling::dd')[0].css('dd::text').extract()[0].strip()
            page = int(detail.xpath('.//dt[text()="Pages:"]/following-sibling::dd')[0].css('dd::text').extract()[0].strip())
            filesize = float(detail.xpath('.//dt[text()="File size:"]/following-sibling::dd')[0].css('dd::text').extract()[0].replace('MB', ' ').strip())
            category = str(detail.xpath('.//dt[text()="Category:"]/following-sibling::dd')[0].css('a::text').extract()[0].strip())

            # The topic is hard to determine here: one book may belong to
            # several top-level topics.

            # The link may contain spaces; browsers convert them
            # automatically, otherwise replace them with %20 manually.
            download_url = response.css('span.download-links>a::attr(href)').extract_first()
        except Exception as err:
            traceback.print_exc()
            print('error: ', err , ', url:', response.url)
            return  # required fields missing -- skip this page

        BookbotSpider.total_books_count += 1

        item = BookItem()
        item['name'] = name
        item['year'] = year
        item['author'] = author
        item['page'] = page
        item['filesize'] = filesize  # MB
        item['category'] = category
        item['url'] = response.url
        item['download_url'] = download_url
        yield item